Importing Datasets and Libraries

library(readr)
library(dplyr)
library(glmnet)
library(caret)
library(fastDummies)
library(tidyr)
library(ModelMetrics)
library(ggplot2)
library(lmtest)
library(MASS)

hosp_df = read_csv("new_hospital_with_wages.csv")
mun_df = read_csv("new_municipalities_with_wages.csv")
sch_df = read_csv("new_school_with_wages.csv")

Hospital Dataset

set.seed(1)

# dropping sector, employer, salary paid, and taxable benefits columns
drop = c("Sector","Employer", "Salary Paid", "Taxable Benefits") 
  
hosp_df = hosp_df[,!(names(hosp_df) %in% drop)] %>%
  drop_na()

# creating dummy variables for job title
df = dummy_cols(hosp_df, remove_selected_columns = TRUE)

n = nrow(df)*0.7
random_index = sample(nrow(df), n) # creating a 70-30 partition of training and testing data
hosp_train = df[random_index, ]
hosp_test = df[-random_index, ] 

x_train_h = data.matrix(hosp_train[-5])
y_train_h = as.vector(unlist(hosp_train[5])) # selecting total compensation as the response variable

x_test_h = data.matrix(hosp_test[-5])
y_test_h = as.vector(unlist(hosp_test[5]))

LINE Assumptions

# cook's distance/influential points
lm_hosp = lm(y_train_h ~ x_train_h)

# residual plot to test for linearity and equal variance
res = resid(lm_hosp)
plot(fitted(lm_hosp), res)
abline(0, 0)

bptest(lm_hosp)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_hosp
## BP = 4439.5, df = 9, p-value < 2.2e-16
# QQ plot to test normal assumption
qqnorm(res)
qqline(res)

# Performing Box-Cox transformation to create the equal variance and normality
bc_data_hosp = data.frame(y = y_train_h, x_train_h)
bc = boxcox(y ~ . , data = bc_data_hosp)

lambda = bc$x[which.max(bc$y)] # finding ideal lambda for boxcox 

box_model_hosp = lm(((y^(lambda)-1)/(y)) ~ ., data = bc_data_hosp)

# observing LINE assumptions for boxcox transformed data
res_hosp = resid(box_model_hosp)
plot(fitted(box_model_hosp), res_hosp)
abline(0, 0)

bptest(box_model_hosp)
## 
##  studentized Breusch-Pagan test
## 
## data:  box_model_hosp
## BP = 15644, df = 9, p-value < 2.2e-16
qqnorm(res_hosp)
qqline(res_hosp)

Box Cox Transformation onto Data

y_train_h = (y_train_h^(lambda)-1)/(y_train_h)
y_test_h = (y_test_h^(lambda)-1)/(y_test_h)

Creating a Lasso Regression with k-fold cv

# Lasso - utilize the training partition to perform validation of results
# selecting lambda using cross validation
set.seed(1)
lasso.cv.out_h = cv.glmnet(x = x_train_h, y = y_train_h, alpha = 1) 
# plot(lasso.cv.out)

train_hosp = predict(lasso.cv.out_h, s = lasso.cv.out_h$lambda.min, newx = x_train_h)
rmse_hosp_train = rmse(y_train_h, train_hosp)
# rmse_hosp_train = 1.599515e-06

# using lambda from smallest cross validation error
pred_hosp = predict(lasso.cv.out_h, s = lasso.cv.out_h$lambda.min, newx = x_test_h)
rmse_hosp = rmse(y_test_h, pred_hosp)
# rmse_hosp = 1.600682e-06

# R2
rss = sum((pred_hosp - y_test_h) ^ 2)  ## residual sum of squares
tss = sum((y_test_h - mean(y_test_h)) ^ 2)  ## total sum of squares
rsq_hosp = 1 - rss/tss
# rsq = 0.3077149

Visualizing the Predictions

comp_data = data.frame('True' = y_test_h, 'Pred' = pred_hosp)

ggplot() +
  geom_point(aes(x = s1, y = True), data = comp_data, color = "blue")

Municipalities Dataset

set.seed(1)

# dropping sector, employer, salary paid, and taxable benefits columns
drop = c("Sector","Employer", "Salary Paid", "Taxable Benefits") 
  
mun_df = mun_df[,!(names(mun_df) %in% drop)] %>%
  drop_na()

# creating dummy variables for job title
df = dummy_cols(mun_df, remove_selected_columns = TRUE)

n = nrow(df)*0.7
random_index = sample(nrow(df), n) # creating a 70-30 partition of training and testing data
mun_train = df[random_index, ]
mun_test = df[-random_index, ] 

x_train_m = data.matrix(mun_train[-5])
y_train_m = as.vector(unlist(mun_train[5])) # selecting total compensation as the response variable

x_test_m = data.matrix(mun_test[-5])
y_test_m = as.vector(unlist(mun_test[5]))

LINE Assumptions

# cook's distance/influential points
lm_mun = lm(y_train_m ~ x_train_m)

# residual plot to test for linearity and equal variance
res = resid(lm_mun)
plot(fitted(lm_mun), res)
abline(0, 0)

bptest(lm_mun)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_mun
## BP = 15.805, df = 9, p-value = 0.07106
# QQ plot to test normal assumption
qqnorm(res)
qqline(res)

# Performing Box-Cox transformation to create the equal variance and normality
bc_data_mun = data.frame(y = y_train_m, x_train_m)
bc = boxcox(y ~ . , data = bc_data_mun)

lambda = bc$x[which.max(bc$y)] # finding ideal lambda for boxcox 

box_model_mun = lm(((y^(lambda)-1)/(y)) ~ ., data = bc_data_mun)

# observing LINE assumptions for boxcox transformed data
res_mun = resid(box_model_mun)
plot(fitted(box_model_mun), res_mun)
abline(0, 0)

bptest(box_model_mun)
## 
##  studentized Breusch-Pagan test
## 
## data:  box_model_mun
## BP = 7310.9, df = 9, p-value < 2.2e-16
qqnorm(res_mun)
qqline(res_mun)

Box Cox Transformation onto Data

y_train_m = (y_train_m^(lambda)-1)/(y_train_m)
y_test_m = (y_test_m^(lambda)-1)/(y_test_m)

Creating a Lasso Regression with k-fold cv

# Lasso - utilize the training partition to perform validation of results
# selecting lambda using cross validation
set.seed(1)
lasso.cv.out_m = cv.glmnet(x = x_train_m, y = y_train_m, alpha = 1) 
# plot(lasso.cv.out)

train_mun = predict(lasso.cv.out_m, s = lasso.cv.out_m$lambda.min, newx = x_train_m)
rmse_mun_train = rmse(y_train_m, train_mun)
# rmse_mun_train = 1.018185e-06

# using lambda from smallest cross validation error
pred_mun = predict(lasso.cv.out_m, s = lasso.cv.out_m$lambda.min, newx = x_test_m)
rmse_mun = rmse(y_test_m, pred_mun)
# rmse_mun = 1.018734e-06

# R2
rss = sum((pred_mun - y_test_m) ^ 2)  ## residual sum of squares
tss = sum((y_test_m - mean(y_test_m)) ^ 2)  ## total sum of squares
rsq_mun = 1 - rss/tss
# rsq = 0.1665321

Visualizing the Predictions

comp_data = data.frame('True' = y_test_m, 'Pred' = pred_mun)

ggplot() +
  geom_point(aes(x = s1, y = True), data = comp_data, color = "blue")

School Dataset

set.seed(1)

# dropping sector, employer, salary paid, and taxable benefits columns
drop = c("Sector","Employer", "Salary Paid", "Taxable Benefits") 
  
sch_df = sch_df[,!(names(sch_df) %in% drop)] %>%
  drop_na()

# creating dummy variables for job title
df = dummy_cols(sch_df, remove_selected_columns = TRUE)

# creating train and test partitions
n = nrow(df)*0.7
random_index = sample(nrow(df), n) # creating a 70-30 partition of training and testing data
sch_train = df[random_index, ]
sch_test = df[-random_index, ]

x_train_s = data.matrix(sch_train[-5])
y_train_s = as.vector(unlist(sch_train[5])) # selecting total compensation as the response variable

x_test_s = data.matrix(sch_test[-5])
y_test_s = as.vector(unlist(sch_test[5]))

LINE Assumptions

# cook's distance/influential points
lm_sch = lm(y_train_s ~ x_train_s)

# residual plot to test for linearity and equal variance
res = resid(lm_sch)
plot(fitted(lm_sch), res)
abline(0, 0)

bptest(lm_sch)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_sch
## BP = 862.56, df = 9, p-value < 2.2e-16
# QQ plot to test normal assumption
qqnorm(res)
qqline(res)

# Performing Box-Cox transformation to create the equal variance and normality
bc_data_sch = data.frame(y = y_train_s, x_train_s)
bc = boxcox(y ~ . , data = bc_data_sch)

lambda = bc$x[which.max(bc$y)] # finding ideal lambda for boxcox 

box_model_sch = lm(((y^(lambda)-1)/(y)) ~ ., data = bc_data_sch)

# observing LINE assumptions for boxcox transformed data
res_sch = resid(box_model_sch)
plot(fitted(box_model_sch), res_sch)
abline(0, 0)

bptest(box_model_sch)
## 
##  studentized Breusch-Pagan test
## 
## data:  box_model_sch
## BP = 2566.3, df = 9, p-value < 2.2e-16
qqnorm(res_sch)
qqline(res_sch)

Box Cox Transformation onto Data

y_train_s = (y_train_s^(lambda)-1)/(y_train_s)
y_test_s = (y_test_s^(lambda)-1)/(y_test_s)

Creating a Lasso Regression with k-fold cv

# Lasso - utilize the training partition to perform validation of results
# selecting lambda using cross validation
set.seed(1)
lasso.cv.out_s = cv.glmnet(x = x_train_s, y = y_train_s, alpha = 1) 
# plot(lasso.cv.out)

train_sch = predict(lasso.cv.out_s, s = lasso.cv.out_s$lambda.min, newx = x_train_s)
rmse_sch_train = rmse(y_train_s, train_sch)
# rmse_sch_train = 8.620759e-07

# using lambda from smallest cross validation error
pred_sch = predict(lasso.cv.out_s, s = lasso.cv.out_s$lambda.min, newx = x_test_s)
rmse_sch = rmse(y_test_s, pred_sch)
# rmse_sch = 8.624815e-07

# R2
rss = sum((pred_sch - y_test_s) ^ 2)  ## residual sum of squares
tss = sum((y_test_s - mean(y_test_s)) ^ 2)  ## total sum of squares
rsq_sch = 1 - rss/tss
# rsq = 0.2186296

Visualizing the Predictions

comp_data = data.frame('True' = y_test_s, 'Pred' = pred_sch)

ggplot() +
  geom_point(aes(x = s1, y = True), data = comp_data, color = "blue")

Comparing the Three Lasso Regression Models

# creating a new dataframe
regr_comp = data.frame(
  sector = c("hospitals", "municipalities", "schools"),
  train_rmse = c(rmse_hosp_train, rmse_mun_train, rmse_sch_train),
  test_rmse = c(rmse_hosp, rmse_mun, rmse_sch),
  R2 = c(rsq_hosp, rsq_mun, rsq_sch)
)
regr_comp
##           sector   train_rmse    test_rmse        R2
## 1      hospitals 1.599515e-06 1.600682e-06 0.3077149
## 2 municipalities 1.018185e-06 1.018734e-06 0.1665321
## 3        schools 8.620759e-07 8.624815e-07 0.2186296